Andrew Ng Machine Learning Exercise ex5: Bias vs. Variance
The file ex5data1.mat stores data on the amount of water flowing out of a dam, split into three parts:
- Training set: X, y
- Cross-validation set: Xval, yval
- Test set: Xtest, ytest
Required imports:
import numpy as np
import scipy.io as sio
import scipy.optimize as opt
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
Data preprocessing
Plot a scatter plot of the training set, and add a column of ones to the feature matrices.
def load_data():
    d = sio.loadmat('./data/ex5data1.mat')
    # flatten every array to 1-D
    return map(np.ravel, [d['X'], d['y'], d['Xval'], d['yval'], d['Xtest'], d['ytest']])

X, y, Xval, yval, Xtest, ytest = load_data()
df = pd.DataFrame({'water_level': X, 'flow': y})
print(df.shape)
sns.lmplot(x='water_level', y='flow', data=df, fit_reg=False)
plt.show()
# reshape to column vectors and prepend the intercept column of ones
X, Xval, Xtest = [np.insert(x.reshape(x.shape[0], 1), 0, np.ones(x.shape[0]), axis=1) for x in (X, Xval, Xtest)]
# print(X, Xval, Xtest)
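A quick sanity check after adding the intercept column (the expected shapes are an assumption based on the exercise data: 12 training and 21 validation/test examples):
print(X.shape, Xval.shape, Xtest.shape)
# expected: (12, 2) (21, 2) (21, 2) -- intercept column plus water_level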
Regularization
The cost function is:
$J\left( \theta \right)=\frac{1}{2m}\sum\limits_{i=1}^{m}{{{({h_\theta}({{x}^{(i)}})-{{y}^{(i)}})}^{2}}}$
Gradient descent:
${\theta_{j}}:={\theta_{j}}-\alpha \frac{\partial }{\partial {\theta_{j}}}J\left(\theta \right)$
The cost function for regularized linear regression is:
$J\left( \theta \right)=\frac{1}{2m}\left[ \sum\limits_{i=1}^{m}{{{({h_\theta}({{x}^{(i)}})-{{y}^{(i)}})}^{2}}}+\lambda \sum\limits_{j=1}^{n}{\theta _{j}^{2}} \right]$
To minimize this cost function with gradient descent, note that $\theta_0$ is not regularized, so the update rule splits into two cases:
$\text{Repeat until convergence}\ \{$
${\theta_0}:={\theta_0}-\alpha \frac{1}{m}\sum\limits_{i=1}^{m}{(({h_\theta}({{x}^{(i)}})-{{y}^{(i)}})x_{0}^{(i)})}$
${\theta_j}:={\theta_j}-\alpha \left[ \frac{1}{m}\sum\limits_{i=1}^{m}{(({h_\theta}({{x}^{(i)}})-{{y}^{(i)}})x_{j}^{(i)})}+\frac{\lambda }{m}{\theta_j} \right]\qquad \text{for } j=1,2,\ldots,n$
$\}$
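The code below implements these updates in vectorized form; writing the regularized gradient in matrix notation makes the NumPy translation direct:
$\nabla_\theta J(\theta)=\frac{1}{m}{X^T}(X\theta -y)+\frac{\lambda }{m}\begin{bmatrix} 0 \\ \theta_1 \\ \vdots \\ \theta_n \end{bmatrix}$
with the first component of the regularization vector zeroed so that $\theta_0$ is not penalized.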
# Cost function
def cost(theta, X, y):
    m = X.shape[0]
    inner = X @ theta - y  # residual vector, shape (m,)
    # for a 1-D array, inner.T @ inner is the sum of squared elements
    return inner.T @ inner / (2 * m)

theta = np.ones(X.shape[1])  # initial theta of ones, as in the exercise
print(cost(theta, X, y))
# 303.9515255535976

# Gradient
def gradient(theta, X, y):
    m = X.shape[0]
    return X.T @ (X @ theta - y) / m  # (n, m) @ (m,) -> (n,)

print(gradient(theta, X, y))
# [-15.30301567 598.16741084]

# Regularized cost and gradient
def regularized_cost(theta, X, y, l=1):
    return cost(theta, X, y) + (l / (2 * X.shape[0])) * np.power(theta[1:], 2).sum()

def regularized_gradient(theta, X, y, l=1):
    m = X.shape[0]
    regularized_term = theta.copy()
    regularized_term[0] = 0  # do not regularize the intercept term theta_0
    regularized_term = (l / m) * regularized_term
    return gradient(theta, X, y) + regularized_term

print(regularized_gradient(theta, X, y, l=1))
# [-15.30301567 598.25074417]
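The exercise does not include it, but a finite-difference check is a cheap way to confirm the analytic gradient; a minimal sketch using the functions above:
def numeric_gradient(f, theta, eps=1e-4):
    # central-difference approximation of the gradient of f at theta
    grad = np.zeros_like(theta, dtype=float)
    for i in range(theta.size):
        e = np.zeros_like(theta, dtype=float)
        e[i] = eps
        grad[i] = (f(theta + e) - f(theta - e)) / (2 * eps)
    return grad

print(numeric_gradient(lambda t: regularized_cost(t, X, y, l=1), theta))
# should closely match regularized_gradient(theta, X, y, l=1)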
Fitting the data
Set the regularization term $\lambda=0$ for this first linear fit.
def linear_regression_np(theta, X, y, l=1):
    # fmin_tnc returns (x, nfeval, rc); index 0 is the optimized theta
    res = opt.fmin_tnc(func=regularized_cost, x0=theta,
                       fprime=regularized_gradient, args=(X, y, l))
    return res

final_theta = linear_regression_np(theta, X, y, l=0)[0]
b = final_theta[0]  # intercept
m = final_theta[1]  # slope
plt.scatter(X[:, 1], y, label="Training data")
plt.plot(X[:, 1], X[:, 1]*m + b, label='Prediction')
plt.legend(loc=2)
plt.show()
Learning curves
def plot_learning_curve(X, y, Xval, yval, l=0):
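    # (reconstruction sketch; only the signature of this routine survived)
    # For each subset size i, fit on the first i training examples and record
    # the unregularized training and cross-validation costs.
    training_cost, cv_cost = [], []
    m = X.shape[0]
    for i in range(1, m + 1):
        res = opt.fmin_tnc(func=regularized_cost, x0=np.ones(X.shape[1]),
                           fprime=regularized_gradient, args=(X[:i, :], y[:i], l))
        training_cost.append(cost(res[0], X[:i, :], y[:i]))  # no reg term in the error
        cv_cost.append(cost(res[0], Xval, yval))             # full validation set
    plt.plot(np.arange(1, m + 1), training_cost, label='training cost')
    plt.plot(np.arange(1, m + 1), cv_cost, label='cv cost')
    plt.legend(loc=1)

plot_learning_curve(X, y, Xval, yval, l=0)
plt.show()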
The learning curve shows a poor fit: both errors stay high, which is underfitting. The linear model is clearly not good enough, so we switch to polynomial features.
Polynomial features
Expand the features up to the 8th power, then normalize them.
def poly_features(x, power, as_ndarray=False):
    # build columns f1..f{power}, where fi = x**i
    data = {'f{}'.format(i): np.power(x, i) for i in range(1, power + 1)}
    df = pd.DataFrame(data)
    return df.to_numpy() if as_ndarray else df
# Normalize features: subtract the mean, divide by the standard deviation
def normalize_feature(df):
    """Applies function along input axis(default 0) of DataFrame."""
    return df.apply(lambda column: (column - column.mean()) / column.std())

def prepare_poly_data(*args, power):
    """
    args: keep feeding in X, Xval, or Xtest
    will return in the same order
    """
    def prepare(x):
        # expand feature
        df = poly_features(x, power=power)
        # normalization
        ndarr = normalize_feature(df).to_numpy()
        # add intercept term
        return np.insert(ndarr, 0, np.ones(ndarr.shape[0]), axis=1)
    return [prepare(x) for x in args]
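A quick look at what the expansion produces, on a hypothetical toy input rather than the dam data:
print(poly_features(np.array([1, 2, 3]), power=3))
#    f1  f2  f3
# 0   1   1   1
# 1   2   4   8
# 2   3   9  27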
Try different values of λ and observe the learning curves.
X, y, Xval, yval, Xtest, ytest = load_data()
X_poly, Xval_poly, Xtest_poly = prepare_poly_data(X, Xval, Xtest, power=8)
plot_learning_curve(X_poly, y, Xval_poly, yval, l=0)
plt.show()
plot_learning_curve(X_poly, y, Xval_poly, yval, l=1)
plt.show()
plot_learning_curve(X_poly, y, Xval_poly, yval, l=100)
plt.show()
With λ=0, i.e. no regularization, the training cost is unrealistically low: the model is overfitting.
With λ=1 the training cost increases a little and is no longer zero, which eases the overfitting somewhat.
With λ=100 there is too much regularization and the model underfits.
Optimal λ
# Find the best fit
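A minimal sketch of the search, assuming the candidate λ list from the exercise handout: fit on the polynomial training set for each λ, pick the λ with the lowest cross-validation cost, then report the test cost.
l_candidates = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
cv_cost = [cost(linear_regression_np(np.ones(X_poly.shape[1]), X_poly, y, l=l)[0], Xval_poly, yval)
           for l in l_candidates]
print('best lambda by CV:', l_candidates[int(np.argmin(cv_cost))])
for l in l_candidates:
    theta = linear_regression_np(np.ones(X_poly.shape[1]), X_poly, y, l=l)[0]
    print('test cost(l={}) = {}'.format(l, cost(theta, Xtest_poly, ytest)))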
# test cost(l=0) = 9.799399498688892
After tuning, lambda = 0.3 is the best choice; the test cost is smallest there.